*-------------------------------------------------------------------------------------------------------*
*RESEARCHERS:		Mette Foged, Linea Hasager and Vasil Yasenov
*PROJECT:			Meta-analysis
*DESCRIPTION:		1  Load database
*					2  Generate additional variables
*					3  Generate datasets with stricter definitions of skill, mix and area approach 
*-------------------------------------------------------------------------------------------------------*


clear all
set more off

use "${data}\Database_estimates_all.dta"



destring estimate wage logwage, replace
rename Country country
drop journal_ranking
rename Rank journal_ranking

*Drop papers not in English, not published nor forthcoming
drop if authors=="DØR"
drop if authors=="Malchow-Møller, Munch and Skaksen" & year==2008

rename wp wp_new
gen wp=1 if wp_new=="1"
drop if wp==1



drop if wp_new=="Don't include"
drop wp_new

*DROP SIMULATION EFFECT SIZES AND FIRM, INDUSTRY LEVEL VARIATION
drop if method=="simulation" | inlist(variation,"structural","Other","t","jt","it","i","iakt")
replace variation="rt" if variation=="rt + imm cohort"

********************
* Data Preparation *
********************

replace se = abs(estimate / t) if se == . & t != .
replace t = estimate / se if t == . & se != .

replace factor = 1 if factor == .
replace estimate = estimate * factor
replace se = se * factor
replace t = t * factor

replace estimate = estimate / mean_foreign_share if log_share == 1
replace t = t / mean_foreign_share if log_share == 1
replace se = se / mean_foreign_share if log_share == 1


replace estimate = - estimate if unemp == 1

replace gender=3 if gender==.

label variable gender "Gender of natives"
label define g 1 "Male" 2 "Female" 3 "Mixture"
label values gender g


/* generate id variable */
egen paper_id = group(authors journal year)
order paper_id

/* generate EU and SCANDINAVIA variables */

gen eu = 1
replace eu = 0 if country == "Turkey" | substr(country,1,13) == "United States" | country == "Israel" ///
| country == "Canada" | country=="Australia" | country=="OECD countries"

gen scandinavia = 0
replace scandinavia = 1 if country == "Norway" | country == "Sweden" |country == "Denmark"

gen restofeu = (eu == 1 & scandinavia == 0)
gen northamerica = substr(country,1,13) == "United States" | country == "Canada"

tab country eu
tab country scandinavia
tab country restofeu
tab country northamerica

/* clean method variable */
replace method = "IV" if method == "2SLS"
replace method = "OLS" if method == "diff-in-diff" |  method == "Diff-in-diff"
tab method, m


drop if method=="simulation"


/* clean skill variable */
replace skill = "High" if skill == "medium" | skill == "high"
replace skill = "Low" if substr(skill, 1,3) == "low"
replace skill = "High" if substr(skill, 1,3) == "med"
replace skill = "all" if skill=="high_low"
tab skill

/* clean data_type */
tab data_frequency
replace data_frequency="Annual or more frequent"	if inlist(data_frequency,"monthly","quarterly") | substr(data_frequency,1,6) ==  "annual"
replace data_frequency="Less frequent" 				if data_frequency !=  "Annual or more frequent"	
tab data_frequency




/* make sure there aren't any papers with too many estimates */
bysort paper_id: gen paper_max = _N
tab authors if paper_max > 30, sort
drop paper_max


/*CREATE EMPLOYMENT DUMMY*/
gen empl=0
replace empl=1 if emp==1 | unemp==1 | logemp==1 | inlist(outcome,"e","e01","u01","p")

*DROP LFP AND OTHER EMPLOYMENT VARIABLES
drop if inlist(outcome,"p","e")

*CLEAN IV TYPE VARIABLE
ta method IV
replace IV="" if !inlist(method,"DIDIV","FDIV","IV")


*Delete unnecessary variables
drop emp logemp unemp Uniquearticles Uniquecountries Employment region Include_estimate Studyinoriginalmetaanalysis Period_covered overall_variation


/*Create region variable*/
gen region="Europe"
replace region="North America" if  country=="Canada"  | country=="United States, Canada" ///
| country=="United States of America" | country=="United States of America, Canada" | country=="United States"
replace region="Rest of the world" if country=="Australia" | country=="Turkey" | country=="Israel" | country=="OECD countries"



/*Replace journal ranking, score etc. with lowest in sample if not published*/
foreach rank in journal_ranking Score SimpleIFrank RecursiveIFrank DiscountedIFrank RecursiveDiscountedIFrank hindexrank {
egen temp=max(`rank')
replace `rank'=temp if `rank'==.
drop temp
}

foreach rank in SimpleIF RecursiveIF DiscountedIF RecursiveDiscountedIF hindex {
egen temp=min(`rank')
replace `rank'=temp if `rank'==.
drop temp
}


*Replace OECD-variables with missing if it's matched more than 5 years after the first year in the data period
gen firstyear=substr(data_period,1,4)
destring firstyear, replace

foreach var in ALMPExp AverageTenure CollBarg ///
StrictEmpIndColl StrictEmpInd StrictEmpColl StrictEmpTemp TradeUnionDens /// 
Unemployment GDPgrowth OutputGap MintoMedianWage MinToMeanWage NetReplacementRate BTE BTTI StateControl PMR {
gen diff_`var'=`var'_year-firstyear
}

summ diff_ALMPExp diff_AverageTenure diff_CollBarg diff_StrictEmpIndColl diff_StrictEmpInd diff_StrictEmpColl diff_NetReplacementRate ///
diff_StrictEmpTemp diff_TradeUnionDens diff_Unemployment diff_GDPgrowth diff_OutputGap diff_MintoMedianWage diff_MinToMeanWage diff_BTE ///
diff_BTTI diff_StateControl diff_PMR


*replace oecd-variables with missing if they are measured more than 5 years after start of the data period
foreach var in ALMPExp AverageTenure CollBarg ///
StrictEmpIndColl StrictEmpInd StrictEmpColl StrictEmpTemp TradeUnionDens /// 
Unemployment GDPgrowth OutputGap MintoMedianWage MinToMeanWage NetReplacementRate BTE BTTI StateControl PMR {
replace `var'=. if diff_`var' > 5 | diff_`var' < -5
}

*check how exact match is for wage estimates
foreach var in CollBarg StrictEmpInd AverageTenure ALMPExp StateControl {
gen difd=0
replace difd=1 if diff_`var'==0
tab difd if wage==1
drop difd
}


drop diff_ALMPExp-diff_PMR


*Rescale share covered by collective bargaining agreements, and net replacement rate
replace CollBarg=CollBarg/100
replace CollBarg_avg=CollBarg_avg/100
replace TradeUnionDens=TradeUnionDens/100
replace TradeUnionDens_avg=TradeUnionDens_avg/100
replace NetReplacementRate=NetReplacementRate/100
replace NetReplacementRate_avg=NetReplacementRate_avg/100

*Relative skill distributions og immigrants and natives in 2000 and 2010
gen ratio_f_2000=Foreign_low_2000/(Foreign_low_2000+Foreign_medium_2000+Foreign_high_2000)
gen ratio_f_2010=Foreign_low_2010/(Foreign_low_2010+Foreign_medium_2010+Foreign_high_2010)
gen ratio_n_2000=Native_low_2000/(Native_low_2000+Native_medium_2000+Native_high_2000)
gen ratio_n_2010=Native_low_2010/(Native_low_2010+Native_medium_2010+Native_high_2010)

gen ratio_f_n_2000=ratio_f_2000/ratio_n_2000
gen ratio_f_n_2010=ratio_f_2010/ratio_n_2010
label variable ratio_f_n_2000 "Ratio of share of low skilled immigrants and share of low skilled natives 2000"
label variable ratio_f_n_2010 "Ratio of share of low skilled immigrants and share of low skilled natives 2010"
gen ratio_diff2000=abs(2000-firstyear)
gen ratio_diff2010=abs(2010-firstyear)

gen ratio_f_n=ratio_f_n_2000
replace ratio_f_n=ratio_f_n_2010 if ratio_diff2010 < ratio_diff2000
drop ratio_f_2000-ratio_diff2010
label variable ratio_f_n "Ratio of share of low skilled immigrants and share of low skilled natives"



*Label variables
label variable ALMPExp "Public expenditure ALMP" 
label variable AverageTenure "Average tenure"
label variable CollBarg "Collective bargaining coverage"
label variable StrictEmpColl "EPL (collective)"
label variable StrictEmpIndColl "EPL (individual and collective)"
label variable StrictEmpInd "EPL"
label variable StrictEmpTemp "EPL (temporary)"
label variable TradeUnionDens "Trade union density"
label variable MintoMedianWage "Min./median wage"
label variable MinToMeanWage "Min./mean wage"
label variable NetReplacementRate "Net replacement rate"
label variable Unemployment "Unemployment rate"
label variable GDPgrowth "GDP growth"
label variable OutputGap "Output gap"
label variable BTE "Barriers to entrepreneurship"
label variable BTTI "Barriers to trade and investment"
label variable StateControl "State control of businesses"
label variable PMR "Product market regulation"

label variable ALMPExp_year "Year of Public expenditure ALMP" 
label variable AverageTenure_year "Year of Average tenure"
label variable CollBarg_year "Year of Collective bargaining coverage"
label variable StrictEmpColl_year "Year of EPL (collective)"
label variable StrictEmpIndColl_year "Year of EPL (individual and collective)"
label variable StrictEmpInd_year "Year of EPL"
label variable StrictEmpTemp_year "Year of EPL (temporary)"
label variable TradeUnionDens_year "Year of Trade union density"
label variable MintoMedianWage_year "Year of Min./median wage"
label variable MinToMeanWage_year "Year of Min./mean wage"
label variable NetReplacementRate_year "Year of Net replacement rate"
label variable Unemployment_year "Year of Unemployment rate"
label variable GDPgrowth_year "Year of GDP growth"
label variable OutputGap_year "Year of output gap"
label variable BTE_year "Year of Barriers to entrepreneurship"
label variable BTTI_year "Year of Barriers to trade and investment"
label variable StateControl_year "Year of State control of businesses"
label variable PMR_year "Year of Product market regulation"

label variable Native_low_2010 "Number of low skilled natives 2010"
label variable Native_medium_2010 "Number of medium skilled natives 2010"
label variable Native_high_2010 "Number of high skilled natives 2010"
label variable Foreign_low_2010 "Number of low skilled foreigners 2010"
label variable Foreign_medium_2010 "Number of medium skilled foreigners 2010"
label variable Foreign_high_2010 "Number of high skilled foreigners 2010"
label variable Native_low_2000 "Number of low skilled natives 2000"
label variable Native_medium_2000 "Number of medium skilled natives 2000"
label variable Native_high_2000 "Number of high skilled natives 2000"
label variable Foreign_low_2000 "Number of low skilled foreigners 2000"
label variable Foreign_medium_2000 "Number of medium skilled foreigners 2000"
label variable Foreign_high_2000 "Number of high skilled foreigners 2000"

label variable eu "Europe"
label variable northamerica "North America"
label variable scandinavia "Scandinavia"
label variable restofeu "Rest of Europe"
label variable Euclid "Euclidian citation score (RePEc)"
label variable hindex "h-index (RePEc)"
label variable wp "Working paper"
label variable wage "Wage effect"
label variable empl "Employment effect"
label variable region "Geographical region"
label variable firstyear "First year of data period"



*CREATE NEW INSTITUTIONAL VARIABLE INCLUDING CROSS-COUNTRY MEANS, AND SET CROSS-COUNTRY INSTITUTIONS TO MISSING IN MAIN VARIABLE
foreach inst of varlist AverageTenure-NetReplacementRate {
gen 	`inst'_cc=`inst'
gen 	`inst'_avg_cc=`inst'_avg
replace `inst'=. 		if inlist(country,"United States, Canada","European Economic Area","OECD countries","Western Europe")
replace `inst'_avg=. 	if inlist(country,"United States, Canada","European Economic Area","OECD countries","Western Europe")
}

*drop unnecessary variables
drop AverageTenure_year-Foreign_high_2000


gen id_estimate=_n

******************************************************************
********OBS! DEFINE VARIATION LIKE DUSTMANN ET. AL 2016***********
******************************************************************
/* clean variation variable */
 
preserve
replace variation = "Mixture" 	if inlist(variation,"gakrt","gart","grt","ort","or","gakr","gr")
replace variation = "Skill" 	if inlist(variation,"gat","ot","o","oat")
replace variation = "Skill" 	if inlist(variation,"gakt","gak")
replace variation = "Area"	 	if inlist(variation,"rt","r","rt + imm cohort")
replace variation = "other" if (authors == "Peri and Sparber") | (authors == "Borjas" & year == 1987)
tab variation, m

drop if (variation!="Area" & variation!="Skill" & variation!="Mixture")
drop if method=="simulation"

*g: skill group. a: experience/age groups. k: gender. r: area/region. t: time. o: occupations. ///
*i: industry. j: firm.


/* save cleaned data */
save "${data}\database_clean_robust.dta", replace
restore

merge 1:1 id_estimate using "${data}\database_clean_robust.dta"
gen excl_firm_industry=1 if _merge==1
drop _merge



/* clean variation variable */
replace variation = "Mixture" 	if inlist(variation,"gakrt","gart","grt","ort","irt","or","gakr","gr")
replace variation = "Skill" 	if inlist(variation,"gat"/*,"jt"*/,"ot"/*,"i"*/,"iot"/*,"it"*/,"o","oat")
replace variation = "Skill" 	if inlist(variation,"gakt"/*,"iakt"*/,"gak"/*,"structural"*/)
replace variation = "Area"	 	if inlist(variation,"rt","r","rt + imm cohort")
replace variation = "Other" 	if inlist(variation,"structural","Other","t")
tab variation, m

drop if (variation!="Area" & variation!="Skill" & variation!="Mixture")
drop if method=="simulation"

**********************************************************
* ADD INDEX OF STANDARIZED INSTITUTION VALUES IF ALL ARE NON-MISSING

merge m:1 country using "$data\OECD_means_2.dta", keepusing(z_* index pc1 d_*)
drop if _merge==2
drop _merge





drop if abs(estimate) > 3

*save data
save "${data}\database_clean_robustness.dta", replace
